Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)
To run this notebook reproducibly, follow these steps:
In [ ]:
# ---- Run parameters: edit these values before running the rest of the notebook ----
# Passed to ns_parallel.parallel_process_paired_reads further down
# (NOTE(review): presumably the number of worker processes -- confirm).
g_num_processors = 3
# Directory holding the input fastq / fastq.gz files; it is passed through ns_files.expand_path before use.
g_fastqs_dir = '~/dual_crispr/test_data/test_set_1'
# Directory handed to the trimming step as its output dir; also passed through ns_files.expand_path before use.
g_trimmed_fastqs_dir = '~/dual_crispr/test_outputs/test_set_1'
# Sequences handed to trim.trim_linked_scaffold: the *_r1 pair is used with the forward-read fastq
# and the *_r2 pair with the reverse-read fastq (presumably "5p"/"3p" = 5'/3' flanking scaffold -- confirm).
g_full_5p_r1 = 'TATATATCTTGTGGAAAGGACGAAACACCG'
g_full_5p_r2 = 'CCTTATTTTAACTTGCTATTTCTAGCTCTAAAAC'
g_full_3p_r1 = 'GTTTCAGAGCTATGCTGGAAACTGCATAGCAAGTTGAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTACTGAG'
g_full_3p_r2 = 'CAAACAAGGCTTTTCTCCAAGGGATATTTATAGTCTCAAAACACACAATTACTTTACAGTTAGGGTGAGTTTCCTTTTGTGCTGTTTTTTAAAATA'
# Forwarded to ns_files.gunzip_wildpath: whether the .gz files are kept alongside the expanded fastqs.
g_keep_gzs = False # True only works for gzip 1.6+ (apparently not available on AWS linux)
In [ ]:
import inspect
import ccbb_pyutils.analysis_run_prefixes as ns_runs
import ccbb_pyutils.files_and_paths as ns_files
import ccbb_pyutils.notebook_logging as ns_logs
def describe_var_list(input_var_name_list):
    """Return a printable "name: value" listing for the given variable names.

    Each entry of input_var_name_list is a string; it is evaluated with
    eval() and rendered as one "<name>: <value>" line terminated by a
    newline. The lines are concatenated in input order, so an empty list
    yields an empty string.
    """
    # NOTE: eval() runs each string as a Python expression, so only pass
    # trusted, hard-coded names here -- never text from an outside source.
    return "".join(
        ["{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list]
    )
# NOTE(review): presumably configures logging so INFO-level messages are written to stdout
# (and thus appear in the notebook output) -- confirm in ccbb_pyutils.notebook_logging.
ns_logs.set_stdout_info_logger()
In [ ]:
# Resolve the two user-supplied directories in place; later cells use the rebound values.
# NOTE(review): expand_path presumably expands "~" to the user's home directory -- confirm.
g_fastqs_dir = ns_files.expand_path(g_fastqs_dir)
# NOTE(review): check_or_set presumably falls back to g_fastqs_dir when g_trimmed_fastqs_dir
# is not set -- confirm in ccbb_pyutils.analysis_run_prefixes.
g_trimmed_fastqs_dir = ns_files.expand_path(ns_runs.check_or_set(g_trimmed_fastqs_dir, g_fastqs_dir))
# Echo the resolved values so they are recorded in the notebook output.
print(describe_var_list(['g_fastqs_dir','g_trimmed_fastqs_dir']))
# NOTE(review): presumably creates the output directory if it does not already exist -- confirm.
ns_files.verify_or_make_dir(g_trimmed_fastqs_dir)
In [ ]:
import dual_crispr.scaffold_trim as trim
# Print the source text of the imported scaffold_trim module so the trimming code used is recorded with this run.
print(inspect.getsource(trim))
In [ ]:
def trim_fw_and_rv_reads(output_dir, full_5p_r1, full_3p_r1, full_5p_r2, full_3p_r2, fw_fastq_fp, rv_fastq_fp):
    """Run trim.trim_linked_scaffold on both fastq files of one read pair.

    The forward-read file (fw_fastq_fp) is processed first, with the read-1
    sequences (full_5p_r1, full_3p_r1); the reverse-read file (rv_fastq_fp)
    is processed second, with the read-2 sequences (full_5p_r2, full_3p_r2).
    Both calls receive the same output_dir. Nothing is returned.
    """
    # (fastq path, 5p sequence, 3p sequence) per read; forward read comes first
    per_read_args = (
        (fw_fastq_fp, full_5p_r1, full_3p_r1),
        (rv_fastq_fp, full_5p_r2, full_3p_r2),
    )
    for fastq_fp, full_5p, full_3p in per_read_args:
        trim.trim_linked_scaffold(output_dir, fastq_fp, full_5p, full_3p)
In [ ]:
# File-name suffix used to identify sequence files in the cells below.
g_seq_file_ext_name = ".fastq"
# Suffix appended to the sequence-file suffix to identify gzipped sequence files (i.e. ".fastq.gz").
g_gzip_ext_name = ".gz"
In [ ]:
# Report on files ending in ".fastq.gz" under the input directory (all_subdirs=True: subdirectories included).
# NOTE(review): just_warn=True presumably means a failed check only warns instead of raising -- confirm.
print(ns_files.check_file_presence(g_fastqs_dir, "", "{0}{1}".format(g_seq_file_ext_name, g_gzip_ext_name),
all_subdirs=True, check_failure_msg=None, just_warn=True))
In [ ]:
import ccbb_pyutils.files_and_paths as ns_files
def unzip_and_flatten_seq_files(top_fastqs_dir, ext_name, gzip_ext_name, keep_gzs):
    """Expand gzipped sequence files under a directory and collect them at its top level.

    top_fastqs_dir -- directory to work in
    ext_name       -- suffix of the sequence files (e.g. ".fastq")
    gzip_ext_name  -- suffix added to ext_name for gzipped files (e.g. ".gz")
    keep_gzs       -- forwarded to ns_files.gunzip_wildpath; whether the gz
                      files are kept as well as being expanded
    """
    gzipped_suffix = ext_name + gzip_ext_name
    recurse_into_subdirs = True

    # step 1: gunzip every gzipped sequence file found anywhere below the input dir
    ns_files.gunzip_wildpath(top_fastqs_dir, gzipped_suffix, keep_gzs, recurse_into_subdirs)

    # step 2: source and destination are the same directory, so all sequence files
    # end up directly in the top-level dir and later steps need not recurse
    ns_files.move_to_dir_and_flatten(top_fastqs_dir, top_fastqs_dir, ext_name)
In [ ]:
# Expand the gzipped fastqs found under g_fastqs_dir and move all fastqs to the top level of that directory.
# g_keep_gzs: False = don't keep gzs as well as expanding, True = do keep them (True only works for gzip 1.6+)
unzip_and_flatten_seq_files(g_fastqs_dir, g_seq_file_ext_name, g_gzip_ext_name, g_keep_gzs)
In [ ]:
# Check the input directory for ".fastq" files now that unzipping/flattening has run, and print the result.
# NOTE(review): check_failure_msg is presumably the error text used when no matching file is found -- confirm.
print(ns_files.check_file_presence(g_fastqs_dir, "", g_seq_file_ext_name,
check_failure_msg="No fastq files to trim were detected."))
In [ ]:
import ccbb_pyutils.parallel_process_fastqs as ns_parallel
# Apply trim_fw_and_rv_reads to the paired read files in g_fastqs_dir through the ccbb_pyutils parallel helper.
# The list holds the leading arguments of trim_fw_and_rv_reads, in signature order: the output directory
# followed by the read-1 and read-2 sequences.
# NOTE(review): the helper presumably supplies each pair's forward and reverse fastq paths as the final
# two arguments and uses g_num_processors as the worker count -- confirm in ccbb_pyutils.parallel_process_fastqs.
g_parallel_results = ns_parallel.parallel_process_paired_reads(g_fastqs_dir, g_seq_file_ext_name, g_num_processors,
trim_fw_and_rv_reads, [g_trimmed_fastqs_dir, g_full_5p_r1,
g_full_3p_r1, g_full_5p_r2, g_full_3p_r2])
In [ ]:
# Print the results collected from the parallel trimming run as a single value.
# NOTE(review): the exact content/format is determined by concatenate_parallel_results -- confirm.
print(ns_parallel.concatenate_parallel_results(g_parallel_results))
In [ ]:
# Final sanity check: look in the output directory for files ending in the suffix that scaffold_trim
# reports for TrimType.FIVE_THREE, and print the result.
# NOTE(review): check_failure_msg is presumably the error text used when no such file is found -- confirm.
print(ns_files.check_file_presence(g_trimmed_fastqs_dir, "", trim.get_trimmed_suffix(trim.TrimType.FIVE_THREE),
check_failure_msg="Scaffold trimming failed to produce trimmed file(s)."))